library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.4 ✓ dplyr 1.0.7
## ✓ tidyr 1.1.3 ✓ stringr 1.4.0
## ✓ readr 2.0.1 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(scales)
##
## Attaching package: 'scales'
## The following object is masked from 'package:purrr':
##
## discard
## The following object is masked from 'package:readr':
##
## col_factor
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(lemon)
##
## Attaching package: 'lemon'
## The following object is masked from 'package:purrr':
##
## %||%
## The following objects are masked from 'package:ggplot2':
##
## CoordCartesian, element_render
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
# Strongly prefer fixed notation over scientific notation when numbers are
# printed (e.g. large dollar amounts on plot axes).
options(scipen = 999)
# Load the movie profit data used by the charts in this script.
moviesAreTheBest <- read.csv(file = "movie_profit-1.csv")
#1
# Bar chart of worldwide gross by genre: every movie's gross is stacked
# into the bar of its genre.
moviesAreTheBest %>%
  ggplot(mapping = aes(x = genre, y = worldwide_gross)) + # axis input data
  geom_col() # bar heights come straight from the y values
#2
# Worldwide gross by genre, drawn as one panel per MPAA rating.
ggplot(
  data = moviesAreTheBest,
  mapping = aes(x = genre, y = worldwide_gross)
) +
  geom_col() +
  scale_y_continuous(labels = dollar_format()) + # show y-axis values as $
  facet_rep_wrap(~mpaa_rating, repeat.tick.labels = TRUE) + # panel per rating
  labs(
    title = "Genre and Worldwide Gross Revenue by MPAA Rating",
    x = "Genre",
    y = "Worldwide Gross Revenue"
  ) # title and axis labels
#3
# Worldwide gross by MPAA rating, drawn as one panel per genre.
moviesAreTheBest %>%
  ggplot(mapping = aes(x = mpaa_rating, y = worldwide_gross)) +
  geom_col() + # bar heights come straight from the y values
  scale_y_continuous(labels = dollar_format()) + # show y-axis values as $
  facet_rep_wrap(~genre, repeat.tick.labels = TRUE) + # lemon facet, repeats tick labels
  labs(
    title = "Worldwide Gross Revenue by MPAA Rating",
    x = "MPAA Rating",
    y = "Worldwide Gross Revenue"
  )
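Issue #1: The closing parenthesis was missing on the 3rd line, in the call to dollar_format().
Issue #2: The stat = "identity" argument was missing on the 4th line; geom_bar() needs it to use the supplied y-axis data instead of counting rows.
Issue #3: facet_rep_wrap(), not facet_wrap(), is the function that accepts the repeat.tick.labels argument.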
#4
# Scatter plot of domestic gross against production budget, with one panel
# per MPAA rating.
ggplot(
  data = moviesAreTheBest,
  mapping = aes(x = production_budget, y = domestic_gross)
) +
  geom_point() +
  facet_rep_wrap(~mpaa_rating, repeat.tick.labels = TRUE)
#5
# Line chart of the number of movies released in each year.
moviesAreTheBest %>%
  # NOTE(review): year() is applied to release_date exactly as read.csv()
  # loaded it; confirm the column is stored in a format lubridate can parse.
  mutate(year = year(release_date)) %>%
  ggplot(aes(x = year)) +
  # One bin per calendar year. The default of 30 bins splits the year range
  # into bins that cover an uneven number of whole years, which distorts the
  # counts. With binwidth = 1 the bins are centred on whole years.
  geom_line(stat = "bin", binwidth = 1)
#6
# Histogram of production budgets for Universal releases, with bars filled by
# MPAA rating and one panel per genre.
moviesAreTheBest %>%
  filter(distributor == "Universal") %>%
  ggplot(mapping = aes(x = production_budget, fill = mpaa_rating)) +
  geom_histogram() +
  scale_x_continuous(labels = dollar_format()) + # show x-axis values as $
  facet_rep_wrap(~genre, repeat.tick.labels = TRUE) +
  labs(
    title = "Distribution of Movies by Production Budget",
    x = "Production Budget",
    y = "Count",
    fill = "MPAA Rating"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
For each genre, the x-axis shows the range of production budgets, and the fill colour breaks each bar down by MPAA rating. The chart makes it evident that R-rated and PG-13 movies have the highest production budgets compared to the other ratings. The only exception is the Adventure genre, where many PG-rated movies also have high production budgets.
#7 part 1
# Interactive box plots of worldwide gross for each MPAA rating.
movie_boxplot <- ggplot(
  data = moviesAreTheBest,
  mapping = aes(x = mpaa_rating, y = worldwide_gross, fill = mpaa_rating)
) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() # ratings on the vertical axis, gross on the horizontal axis
# Convert the static ggplot into an interactive plotly widget.
ggplotly(p = movie_boxplot)
#7 part 2
# Same interactive box plots as part 1, restricted to movies distributed by
# Universal. Reuses (overwrites) the movie_boxplot variable.
movie_boxplot <- moviesAreTheBest %>%
  filter(distributor == "Universal") %>%
  ggplot(
    mapping = aes(x = mpaa_rating, y = worldwide_gross, fill = mpaa_rating)
  ) +
  geom_boxplot(show.legend = FALSE) +
  coord_flip() # ratings on the vertical axis, gross on the horizontal axis
ggplotly(p = movie_boxplot)
In these boxplots, the PG-13 and G distributions appear right-skewed, while the PG and R distributions appear roughly symmetrical. In addition, every rating except G has outliers above the upper whisker.
#8 part 1
# Polar bar chart of the number of movies in each genre.
moviesAreTheBest %>%
  ggplot(aes(x = genre, fill = genre)) +
  # geom_bar() counts the rows per genre itself, so no placeholder y aesthetic
  # is needed and the radial axis shows the actual movie counts.
  geom_bar() +
  coord_polar(theta = "x", start = 0)
In this polar chart, we can see that more movies are produced in the Drama genre than in any other genre. Conversely, Horror is the genre with the fewest movies produced.
#8 part 2
# Interactive stacked bar chart: number of movies per distributor for three
# studios, with each bar split by MPAA rating.
stackedBarChart <- moviesAreTheBest %>%
  filter(
    distributor %in% c("Universal", "Warner Bros", "Sony Pictures")
  ) %>%
  ggplot(mapping = aes(x = distributor, fill = mpaa_rating)) +
  geom_bar() # default stat counts the movies in each group
ggplotly(p = stackedBarChart)
Looking at the rating distribution for these 3 studios’ films, we can see that they release significantly more PG-13 and R-rated movies than movies with any other rating. For example, Universal has released almost equal numbers of PG-13 (119) and R-rated (111) movies; however, it has released relatively few G-rated movies (7). Of these 3 studios, Warner Bros has released the greatest number of R-rated movies (157), and Sony Pictures has released the greatest number of PG-13 movies (138).